import sqlite3
import os, time, re, sys
import json
import pandas as pd
import random
import multiprocessing as mp
from functools import partial

sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from utils.config import load_config
from utils.logger import set_logger

# CONFIG_PATH = '../config/comments_config.yml'
# LOG_PATH = '../logs/chunk_uk_202104_202203.log'
# Load the configuration file
config = load_config()
logger = set_logger()

# Setting the working directory to the directory of the script
os.chdir(os.path.dirname(__file__))

# CONSTANTS
START_DATETIME = config['start_datetime']  # '2023-01-01T00:00:00Z'
END_DATETIME = config['end_datetime']  # '2023-12-31T23:59:59Z'
COUNTRY_NAME = config['country_name']  # e.g. "mexico"
COUNTRY_CODE = config['country_code']  # e.g. "mx"

# Convert the start and end datetimes to the format `YYYYMM`
START_YEAR_MONTH = re.sub(r'[^0-9]', '', START_DATETIME[:7])
END_YEAR_MONTH = re.sub(r'[^0-9]', '', END_DATETIME[:7])

# PATHS
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
INPUT_DIR = os.path.join(PROJECT_ROOT, '../../../data', COUNTRY_NAME)
OUTPUT_DIR = os.path.join(PROJECT_ROOT, '../../../output', COUNTRY_NAME)
VIDEO_DB_PATH = os.path.join(OUTPUT_DIR, f'{START_YEAR_MONTH[:4]}-{START_YEAR_MONTH[4:]}-collection/{START_YEAR_MONTH}_{END_YEAR_MONTH}_02_{COUNTRY_CODE}_youtube_data.db')


def chunk_table(db_file, table_name, chunk_size, primary_key):
    """
    Chunk a SQLite table into smaller tables with N random records each.

    Args:
        db_file (str): Path to the SQLite database file.
        table_name (str): Name of the table to chunk.
        chunk_size (int): Number of records per chunk.
    """
    logger.info(f"Connecting to database: {db_file}")
    # Use a context manager to handle the database connection
    with sqlite3.connect(db_file) as conn:
        cursor = conn.cursor()

        try:
            # Fetch all record IDs
            logger.info(f"Fetching all row IDs from table: {table_name}")
            cursor.execute(f"SELECT {primary_key} FROM {table_name}")
            all_ids = [row[0] for row in cursor.fetchall()]
            logger.info(f"Total records fetched: {len(all_ids)}")

            # Shuffle the IDs to randomize the records
            logger.info("Shuffling records...")
            random.seed(42)  # Set a random seed for reproducibility
            random.shuffle(all_ids)

            # Split IDs into chunks
            logger.info(f"Splitting records into chunks of size: {chunk_size}")
            chunks = [all_ids[i:i + chunk_size] for i in range(0, len(all_ids), chunk_size)]
            logger.info(f"Total chunks created: {len(chunks)}")

            # Create new tables for each chunk
            for i, chunk in enumerate(chunks):
                chunk_table_name = f"{table_name}_chunk_{i+1}"

                logger.info(f"Creating table: {chunk_table_name}")
                # Create the chunk table with the same schema as the original table
                cursor.execute(f"CREATE TABLE {chunk_table_name} AS SELECT * FROM {table_name} WHERE 1=0")

                logger.info(f"Inserting {len(chunk)} records into table: {chunk_table_name}")
                # Insert records into the chunk table in smaller sub-batches if necessary
                batch_size = 50  # Define a smaller batch size
                for j in range(0, len(chunk), batch_size):
                    sub_batch = chunk[j:j + batch_size]
                    id_placeholders = ','.join('?' for _ in sub_batch)
                    cursor.execute(f"INSERT INTO {chunk_table_name} SELECT * FROM {table_name} WHERE {primary_key} IN ({id_placeholders})", sub_batch)

                logger.info(f"Created table {chunk_table_name} with {len(chunk)} records.")

                # Commit after each chunk creation to ensure resilience
                logger.info("Committing changes after creating chunk table...")
                conn.commit()

        except Exception as e:
            logger.info(f"An error occurred: {e}")

# Example usage
if __name__ == "__main__":
    db_file = VIDEO_DB_PATH  # Path to your .db file
    table_name = "videos"       # Name of the table to chunk
    chunk_size = 10000          # Number of records per chunk
    primary_key = "video_id"    # Primary key column name

    logger.info("Starting chunking process...")
    chunk_table(db_file, table_name, chunk_size, primary_key)
    logger.info("Chunking process completed.")
